


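'''
1. Build the data set.
2. Compute the prior probability of each class label (y / n) in the sample data.
3. Compute the conditional probability of each feature value given the class.
4. Combine the priors and the conditional probabilities (Bayes' rule) to score
   each class for a test sample.
'''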
# Build the data set
def createdataset():
    """Return the toy samples together with the names of their feature columns.

    Every sample is a list of four integer feature codes followed by the
    class label ('y' or 'n') in the last position.

    Returns:
        tuple: ``(dataset, labels)`` where ``dataset`` is a list of 14 rows
        (each a new list) and ``labels`` names the four feature columns in
        column order.
    """
    raw_rows = (
        (0, 2, 0, 0, 'n'),
        (0, 2, 0, 1, 'n'),
        (1, 2, 0, 0, 'y'),
        (2, 1, 0, 0, 'y'),
        (2, 0, 1, 0, 'y'),
        (2, 0, 1, 1, 'n'),
        (1, 0, 1, 1, 'y'),
        (0, 1, 0, 0, 'n'),
        (0, 0, 1, 0, 'y'),
        (2, 1, 1, 0, 'y'),
        (0, 1, 1, 1, 'y'),
        (1, 1, 0, 1, 'y'),
        (1, 2, 1, 0, 'y'),
        (2, 1, 0, 1, 'n'),
    )
    feature_names = ['age', 'income', 'job', 'credit']
    # Hand back mutable lists, one fresh list per row on every call.
    return [list(row) for row in raw_rows], feature_names
# Build the sample rows and feature names once at module level, then echo the rows.
ds, lb = createdataset()
print(ds)

# Compute the prior probability of a class label (y / n) in the sample data
def prob(dataset, cls_val, label_index=4):
    """Return the fraction of rows in ``dataset`` labelled ``cls_val``.

    Args:
        dataset: Sequence of rows; each row stores its class label at
            position ``label_index``.
        cls_val: Class label to count.
        label_index: Position of the class label inside a row. Defaults to
            4, the column that was previously hard-coded.

    Returns:
        float: Number of matching rows divided by the number of rows, or
        0.0 when ``dataset`` is empty.
    """
    total = len(dataset)
    if total == 0:
        # No samples at all: report 0.0 instead of dividing by zero.
        return 0.0
    matches = sum(1 for row in dataset if row[label_index] == cls_val)
    return float(matches) / total

# Compute a conditional probability
def conditionP(dataset, cls_val, arr_index, arr_val, label_index=4):
    """Return P(feature ``arr_index`` == ``arr_val`` | class == ``cls_val``).

    The estimate is the plain relative frequency inside the class; no
    smoothing is applied.

    Args:
        dataset: Sequence of rows; each row stores its class label at
            position ``label_index``.
        cls_val: Class label that selects the rows to look at.
        arr_index: Column index of the feature being tested.
        arr_val: Feature value to count within the selected rows.
        label_index: Position of the class label inside a row. Defaults to
            4, the column that was previously hard-coded.

    Returns:
        float: Rows of the class having the feature value divided by all
        rows of the class, or 0.0 when no row belongs to ``cls_val``.
    """
    class_rows = [row for row in dataset if row[label_index] == cls_val]
    if not class_rows:
        # The class never occurs (e.g. after an unlucky split): report 0.0
        # instead of dividing by zero.
        return 0.0
    hits = sum(1 for row in class_rows if row[arr_index] == arr_val)
    return float(hits) / len(class_rows)

# Combine the class prior with the per-feature conditional probabilities
def NB(dataset, test, cls_y, cls_n):
    """Score a test sample against two classes with naive Bayes.

    For each class the score is the class prior from ``prob`` multiplied by
    ``conditionP`` for every feature value of ``test``, taken in order. The
    scores are not normalised.

    Args:
        dataset: Training rows passed through to ``prob`` and ``conditionP``.
        test: Feature values of the sample to score; position ``i`` is
            matched against column ``i`` of the training rows.
        cls_y: First class label.
        cls_n: Second class label.

    Returns:
        dict: ``{cls_y: score_y, cls_n: score_n}``.
    """
    scores = {}
    for label in (cls_y, cls_n):
        score = prob(dataset, label)
        for column, value in enumerate(test):
            score *= conditionP(dataset, label, column, value)
        scores[label] = score
    return scores

# Score one test sample with the hand-written classifier.
tst = [0, 2, 0, 1]
# Keep the result under its own name: binding it to `prob` would replace the
# prob() function that NB() looks up, so any later NB() call would fail.
nb_scores = NB(ds, tst, 'y', 'n')
print(nb_scores)

# Split the data set into a training set and a test set
from sklearn.model_selection import train_test_split

# A 30% test / 70% train split is the usual choice; fixing random_state makes
# every run produce the same partition.
ds_train, ds_test = train_test_split(ds, random_state=5, test_size=0.30)
print('训练集:', ds_train)
print('测试集:', ds_test)

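# Model training and testing
# The naive Bayes variants differ in the distribution they assume for the features given the class:
# GaussianNB: Gaussian likelihood fitted by maximum likelihood; suited to features that are mostly continuous
# MultinomialNB: multinomial likelihood estimated from sample frequencies; suited to multi-valued discrete features
# BernoulliNB: Bernoulli likelihood; suited to binary features or very sparse multi-valued discrete features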



from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
# Separate the feature columns from the class label. Every row holds four
# features (age, income, job, credit) followed by the label, so the slice has
# to stop at index 4; the earlier e[0:3] silently dropped the 'credit' column.
target = [row[4] for row in ds]
data = [row[:4] for row in ds]

# One estimator per naive Bayes variant so their results can be compared.
clf1 = MultinomialNB()
clf2 = BernoulliNB()
clf3 = GaussianNB()

# Hold out 30% of the samples for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.3, random_state=4)

# Fit each model on the training part and predict the held-out part.
y_pred1 = clf1.fit(X_train, y_train).predict(X_test)
y_pred2 = clf2.fit(X_train, y_train).predict(X_test)
y_pred3 = clf3.fit(X_train, y_train).predict(X_test)

# Show the true labels next to each model's predictions, then the accuracy.
print('实际结果:', y_test)
print('MultinomialNB预测结果:', y_pred1)
print('BernoulliNB预测结果:', y_pred2)
print('GaussianNB预测结果:', y_pred3)
print(" MultinomialNB的Acc: %lf" % (clf1.score(X_test, y_test)))
print(" BernoulliNB的Acc: %lf" % (clf2.score(X_test, y_test)))
print(" GaussianNB的Acc: %lf" % (clf3.score(X_test, y_test)))

